#!/usr/bin/env ruby
# Copyright (C) 2012-2014 Cisco, Inc.

require 'json'
require 'pathname'

$stdout.sync = true # persistent process

# The first command-line argument is a JSON document; its keys become Symbols.
# Settings missing from it fall back to these values, settings present in it
# are kept as given.
defaults = { interval: 60, flaptime: 30, since: 900 }
opt = JSON.parse(ARGV[0], symbolize_names: true)
opt.update(defaults) { |_name, configured, _fallback| configured }

class MatchData
  # Returns the named captures of this match as a Hash keyed by Symbol.
  # A named group that did not take part in the match maps to nil.
  def to_h
    names.each_with_object({}) do |name, captures|
      captures[name.to_sym] = self[name]
    end
  end
end
# Backfill Array#to_h (array of [key, value] pairs => Hash) for Rubies that
# lack it.  Where the method is already built in, it is left alone so that
# the native behaviour (including the block form) is not overridden.
class Array
  unless method_defined?(:to_h)
    def to_h; Hash[self]; end
  end
end

# Service-type handlers, keyed by service type.  Each handler is called once
# with that type's settings hash (it removes the :'-options' and :'-monitor'
# entries from the hash) and returns a zero-argument lambda.  Calling that
# lambda polls the services and returns
#   { name => { state: ..., duration: ..., ... } }
handler = {
  # daemontools: runs svstat over a set of service directories.
  #   :'-options' => { svstat: cmd }  command (String or Array) to run
  #                                   instead of 'svstat'
  #   :'-monitor' => [glob, ...]      every match is polled and reported
  #                                   under its basename (as a Symbol)
  #   name => { path: dir }           explicitly listed service; dir
  #                                   defaults to /service/<name>
  # The poller raises "cannot parse ..." on an unrecognised status line.
  # Captured values (:state, :pid, :duration, :normally) are Strings or nil.
  daemontools: ->(srv){
    o = srv.delete(:'-options') || {}
    mon = srv.delete(:'-monitor') || []
    cmd = [o[:svstat] || 'svstat'].flatten
    # fn maps service directory (String) => name to report it under, for
    # globbed and explicitly listed services alike.  The keys are the exact
    # arguments handed to svstat and are what the text before the colon of
    # each status line is looked up against (this assumes svstat prints each
    # directory as it was given).  Service names are Symbols, hence the
    # interpolation when building the default path.
    fn = mon.map {|n| Pathname.glob(n.to_s)}.flatten.
        map {|p| [p.to_s, p.basename.to_s.to_sym]}.to_h.
      merge(
        srv.keys.map {|n| [(srv[n][:path] || "/service/#{n}").to_s, n]}.to_h)
    ->() {
      # block form closes the pipe even if reading raises
      stat = IO.popen(cmd + fn.keys) {|fh| fh.readlines}
      stat.map {|l|
        l = l.chomp
        info = l.match(%r{
          \A(?<key>\S+):\s+
          (?<state>up|down)
          (?:\s+\(pid\s+(?<pid>\d+)\))?\s+
          (?<duration>\d+)\s+seconds
          (?:,\s+normally\s(?<normally>\S+))?
        }x) or raise "cannot parse #{l}"
        info = info.to_h
        name = fn[info.delete(:key)]
        [name, info]
      }.to_h
    }
  },
  # smf: runs `svcs -x` followed by the :'-monitor' entries.
  #   :'-options' => { svcs: cmd }    command (String or Array) to run
  #                                   instead of 'svcs'
  #   :'-monitor' => [arg, ...]       extra arguments appended to the
  #                                   command; must be an Array
  # The poller reports each service under the first word of its paragraph,
  # with state 'online' renamed to 'up' and duration in whole seconds since
  # the printed timestamp.  Services in state 'disabled' are left out.
  # It raises "cannot parse ..." on an unrecognised paragraph and
  # "svcs error - N" when the command exits unsuccessfully.
  smf: ->(srv) {
    mon = srv.delete(:'-monitor') || []
    mon.is_a?(Array) or raise "'-monitor' argument must be an array"
    o = srv.delete(:'-options') || {}
    cmd = [o[:svcs] || 'svcs'].flatten

    require 'time'

    ->() {
      now = Time.now.to_i
      # the output is read one blank-line-separated paragraph at a time
      stat = IO.popen(cmd + ['-x'] + mon) {|fh| fh.readlines("\n\n") }.
        map {|svc|
          info = svc.match(%r{
            \A(?<fmri>\S+)\s.*?\n
            (?:\s+.*?\)\n)?
            \s+State:\s+(?<state>\S+)\s+
              since\s+(?<date>.*?)\n
          }x) or raise "cannot parse #{svc}"
          info = info.to_h
          # Time.parse takes a timestamp without a zone designator as local
          # time; DateTime.parse would read it as UTC and skew the duration
          # by the local UTC offset.
          duration = now - Time.parse(info[:date]).to_i
          # don't report negative duration
          duration = 0 if duration < 0
          # fault time needs to be non-zero to turn into negative metric
          duration = 1 if duration == 0 and info[:state] != 'online'
          [info[:fmri], {
            state: (info[:state] == 'online' ? 'up' : info[:state]),
            duration: duration,
          }]
        }.to_h
      raise "svcs error - #{$?.exitstatus}" unless $?.success?

      # TODO report disabled, but explicitly listed services?
      stat.delete_if {|_fmri, i| i[:state] == 'disabled'}
      stat
    }
  },
}

# Build one poller per configured service type by handing that type's
# settings to its handler; an unrecognised type aborts start-up.
services = opt[:services].each_with_object({}) do |(kind, settings), pollers|
  build = handler[kind] or raise "unknown service type #{kind}"
  pollers[kind] = build[settings]
end

# Keeps a short per-service history of polled samples and turns each new
# sample into flap/uptime metrics.
class AFlap
  # opts[:since]    - age in seconds after which samples are dropped (default 900)
  # opts[:flaptime] - an "up" sample whose duration is at most this many
  #                   seconds counts towards flapping (default 5)
  def initialize(opts = {})
    @since    = opts[:since] || 900
    @flaptime = opts[:flaptime] || 5
    @history  = {}
  end

  # Records sample i (reads :state and :duration) under history key k.
  #
  # Returns { flaps: n, up: d } where n is the length of the run of most
  # recent samples that are all "up" with duration <= flaptime, and d is the
  # sample's duration, negated when the state is anything other than 'up'.
  def log(k, i)
    now = Time.now
    samples = (@history[k] ||= [])

    # discard samples that have aged out of the window
    oldest_kept = now - @since
    samples.shift until samples.empty? || samples.first[:time] >= oldest_kept

    current = {
      time: now,
      duration: i[:duration].to_i,
      up: i[:state] == 'up',
    }
    samples << current

    # newest first; stop at the first sample that is down or long-lived
    short_lived = samples.reverse_each.take_while do |sample|
      sample[:up] && sample[:duration] <= @flaptime
    end

    sign = current[:up] ? 1 : -1
    { flaps: short_lived.length, up: sign * current[:duration] }
  end
end

########################################################################
# mainloop
#
# Every opt[:interval] seconds: poll each service type, run every sample
# through the flap tracker and print the result as one line of JSON shaped
#   { type => { name => { flaps: n, up: d } } }
# With opt[:limit] set, the script exits after that many reports.
hist = AFlap.new(opt)
loop do
  metrics = services.map {|k,v|
    data = v.call.map {|n,i|
      [n, hist.log("#{k}|#{n}", i)]
    }.to_h
    [k, data]
  }.to_h

  # TODO move these sorts of things to a summarizing plugin
  if opt[:faults]
    # a service is faulted unless it is up (positive :up) and not flapping
    faulted = metrics.map {|daemon,h| h.map {|service,i|
      healthy = i[:up] > 0 && !(i[:flaps] > 0)
      healthy ? [] : ["#{daemon}|#{service}"]
    }}.flatten
    # 'only' drops the per-service metrics and reports just the summary
    metrics = {} if opt[:faults] == 'only'
    metrics[:"-faults"] = faulted.count
    metrics[:_info] ||= {faults: faulted} if faulted.count > 0
  end

  puts JSON::generate(metrics)
  # check the report limit before sleeping so the final report is not
  # followed by a pointless wait of a whole interval
  if opt[:limit]
    opt[:limit] -= 1
    break if opt[:limit] == 0
  end
  sleep(opt[:interval])
end
